library(tidyverse)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(vegan)
samples.unknown <- species_table %>%
melt () %>%
group_by(variable) %>%
summarise(sum=sum(value)) %>%
filter(sum == 0) %>%
pluck(1) %>%
as.vector()
No id variables; using all as measure variables
# Remove samples with no known taxa
species_table <- species_table[, -which(names(species_table) %in% samples.unknown)]
species_table <- species_table[,rownames(map_table)] # reorder to match metadata
species_table <- species_table[,-c(grep("Control", map_table$sample.type))] # remove control samples
genus_table <- genus_table[,rownames(map_table)]
genus_table <- genus_table[,-c(grep("Control", map_table$sample.type))]
map_table <- map_table[-c(grep("Control", map_table$sample.type)), ]
map_table$sample.type = factor(map_table$sample.type)
all(row.names(map_table) %in% colnames(species_table))
[1] TRUE
# species_table <- species_table[,rownames(map_table)] # rereorder to match metadata
# species_table <- species_table[,-c(grep("FALSE", map_table$is.baseline))]
#map_table <- map_table[-c(grep("FALSE", map_table$is.baseline)), ]
# set sample type as factor
#all(row.names(map_table) %in% colnames(species_table)) # Sanity check
diversity_vec = matrix(nrow = dim(species_table)[2], ncol = 2)
diversity_vec = as.data.frame(diversity_vec)
for (a in 1:dim(species_table)[2]) {
diversity_vec[a,1] = diversity(species_table[,a], index = "shannon")
diversity_vec[a,2] = diversity(species_table[,a], index = "simpson")
}
colnames(diversity_vec) = c("Shannon", "Simpson")
# add sample.type factor
diversity_vec$sample.type = map_table$sample.type
diversity_vec$sample.type = factor(diversity_vec$sample.type) # Optional: Add Levels
ggplot(diversity_vec, aes(x = sample.type, y = Shannon, fill = sample.type)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
#geom_violin(alpha = 0.8) +
geom_jitter(size = 1, width = 0.1, alpha = 0.35) +
#scale_fill_manual(values=c("blue", "lightgreen", "yellow", "brown", "violet", "gray"),
# labels = c("Toothbrush", "HMP-Oral", "HMP-Skin", "HMP-Gut", "HMP-Vaginal", "Building dust")) +
theme_bw() +
theme(legend.position = "none",
axis.title.x = element_blank(),
#axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.y = element_text(size = 14),
axis.text.y = element_text(size = 14))
summary(aov(Shannon ~ sample.type, diversity_vec))
Df Sum Sq Mean Sq F value Pr(>F)
sample.type 2 0.96 0.4790 0.896 0.409
Residuals 223 119.14 0.5343
TukeyHSD(aov(Shannon ~ sample.type, diversity_vec))
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Shannon ~ sample.type, data = diversity_vec)
$sample.type
diff lwr upr p adj
Pneumonia-NonPneumonia -0.01538685 -0.4412036 0.4104299 0.9960005
Unknown-NonPneumonia 0.20087248 -0.3418635 0.7436084 0.6577194
Unknown-Pneumonia 0.21625932 -0.1650512 0.5975698 0.3755754
# beta-diversity measure
beta <- vegdist(t(species_table), 'bray', binary = T)
beta.mat <- as.matrix(beta)
# projection
pcoa <- cmdscale(beta, k = 4, eig = TRUE)
# cleanup
ord <- as.data.frame(pcoa$points)
names(ord) <- c('pcoa1', 'pcoa2', 'pcoa3', 'pcoa4') ## rename coordinates
# add metadata
ord$Category = map_table$sample.type
ord$Outcome = map_table$binned_outcome
ord$Baseline = map_table$is.baseline
ord$Diagnosis = map_table$diagnosis.subtype
# Percent explained variation
eig <- eigenvals(pcoa)
eig.percent <- 100*head(eig/sum(eig))
eig.percent
[1] 27.568102 10.935027 9.314035 8.172572 5.591935 4.743136
eig.1 <- paste("PCo1 (", as.character(round(eig.percent[1], digits = 1)), "%)", sep = "")
eig.2 <-paste("PCo2 (", as.character(round(eig.percent[2], digits = 1)), "%)", sep = "")
## plot PCoA (FIGURE 1A)
ggplot(data = ord, aes(x = pcoa1, y = pcoa2, fill = Diagnosis)) +
geom_point(size = 3, stroke = 1, shape=21) +
theme_bw() +
xlab(eig.1) +
ylab(eig.2) +
theme_q2r() +
theme(axis.text = element_text(size=15, color = "black"),
axis.title = element_text(size=16, color = "black"),
legend.text = element_text(size=15, color = "black"),
legend.title = element_text(size=15, color = "black")) +
scale_fill_brewer(guide="legend", palette="Set1") +
#scale_shape_manual(values=c(21))+#, 22, 23, 24, 25, 26)) +
guides(fill=guide_legend(override.aes=list(shape=21))) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
ggsave("../../results/notebook_out/02A_MetaphlanPCoA.pdf", units="in", width=7, height=4.5)
Error in grDevices::pdf(file = filename, ..., version = version) :
cannot open file '../../results/notebook_out/02A_MetaphlanPCoA.pdf'
# effect of sample type
beta <- vegdist(t(species_table), 'bray', binary = T)
adonis_out <- adonis2(beta ~ sample.type, data = map_table, permutations = 999)
adonis_out
Permutation test for adonis under reduced model
Terms added sequentially (first to last)
Permutation: free
Number of permutations: 999
adonis2(formula = beta ~ sample.type, data = map_table, permutations = 999)
Df SumOfSqs R2 F Pr(>F)
sample.type 2 1.364 0.01712 1.9416 0.017 *
Residual 223 78.308 0.98288
Total 225 79.672 1.00000
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
sample.type.1 <- map_table$sample.type[1]
sample.type.1
[1] Pneumonia
Levels: NonPneumonia Pneumonia Unknown
mean(as.matrix(vegdist(t(species_table[,grep(sample.type.1,map_table$sample.type)]),
'jaccard', binary = T)))
[1] 0.8587207
sd(as.matrix(vegdist(t(species_table[,grep(sample.type.1,map_table$sample.type)]),
'jaccard', binary = T)))/sqrt(34)
[1] 0.03923536
min_mean_proportion <- .00001
min_prevalence <- 20
species.ranked <- species_table %>%
rownames_to_column() %>%
melt() %>%
filter(value > 0) %>% # filter 0 values to get prevalence
group_by(rowname) %>%
summarise(mean = mean(value), n = n(), sum=sum(value)) %>%
arrange(desc(mean)) %>%
#filter(mean > min_mean_proportion) %>% # Filter by mean percent
filter(n > min_prevalence) %>% # Filter by prevalence
pluck(1)
Using rowname as id variables
length(species.ranked)
[1] 19
species_table.heat <- species_table[species.ranked,] %>%
rownames_to_column() %>%
separate(rowname, into = c("ExtraTaxa", "Taxa"), sep="s__") %>%
select(!ExtraTaxa) %>%
column_to_rownames("Taxa") %>%
as.matrix
library(pheatmap)
annot.col <- map_table %>% select(sample.type, binned_outcome, is.baseline)
paletteLength <- 100
myColors <- rev(colorRampPalette(rev(brewer.pal(n = 9, name ="Reds")))(paletteLength))
myColors <- rev(colorRampPalette(rev(brewer.pal(n = 9, name ="RdBu")))(paletteLength))
#pdf(file="../../results/notebook_out/02A_MetaphlanHeatPlot.pdf", width = 25, height = 20)
pheatmap(species_table.heat,
color = myColors,
annotation_col = annot.col,
angle_col = "45",
show_colnames=FALSE,
show_rownames = FALSE, scale="row")
#dev.off()
min_mean_proportion <- .00001
min_prevalence <- 20
genera.ranked <- genus_table %>%
rownames_to_column() %>%
melt() %>%
filter(value > 0) %>% # filter 0 values to get prevalence
group_by(rowname) %>%
summarise(mean = mean(value), n = n(), sum=sum(value)) %>%
arrange(desc(mean)) %>%
#filter(mean > min_mean_proportion) %>% # Filter by mean percent
filter(n > min_prevalence) %>% # Filter by prevalence
pluck(1)
Using rowname as id variables
length(genera.ranked)
[1] 13
# Order by species rank object and remove extra taxnomic info
genus_table.heat <- genus_table[genera.ranked,] %>%
rownames_to_column() %>%
separate(rowname, into = c("ExtraTaxa", "Taxa"), sep="g__") %>%
select(!ExtraTaxa) %>%
column_to_rownames("Taxa") %>%
as.matrix